Simple sketches showing different possible mark distributions.
Note that this is a working notebook, off the top of my head, and I haven't necessarily checked things properly... That said, even if there are errors, much of the stuff contained in the notebook could be useful when it comes to doing things properly...
# pandas is a package for working with 2d tabular data
import pandas as pd
# numpy is a package for doing things with numbers...
import numpy as np
# ipywidgets are interactive widgets
from ipywidgets import interact
# Plotting support
import matplotlib.pyplot as plt
# seaborn is a statistical charting package
import seaborn as sns
# Inline charting
%matplotlib inline
Derive random distribution of OCAS vs OES with a known correlation:
# Based on https://stackoverflow.com/a/18684433/454773
def simpleCorrelation(corr=1.0,
                      lower_ocas=30, upper_ocas=95,
                      lower_oes=10, upper_oes=90,
                      n=1000):
    """Simulate correlated OCAS and OES marks.

    Marks are drawn from a bivariate normal distribution. For each
    component the mean is the midpoint of its lower/upper limits and the
    standard deviation is one third of the half-range, so the limits sit
    three standard deviations either side of the mean.

    Args:
        corr: Correlation coefficient between the OCAS and OES marks.
        lower_ocas: Lower limit used to position the OCAS distribution.
        upper_ocas: Upper limit used to position the OCAS distribution.
        lower_oes: Lower limit used to position the OES distribution.
        upper_oes: Upper limit used to position the OES distribution.
        n: Number of simulated students (rows) to generate.

    Returns:
        A DataFrame with ``n`` rows and the columns 'OCAS' and 'OES'
        (marks truncated to integers) and 'Rank' (the unweighted mean of
        the two truncated marks).
    """
    ocas = np.array([lower_ocas, upper_ocas])
    oes = np.array([lower_oes, upper_oes])
    means = [ocas.mean(), oes.mean()]
    # The std of a two-point array is half the range; dividing by 3 puts
    # the limits at +/- 3 standard deviations.
    stds = [ocas.std() / 3, oes.std() / 3]
    covariance = stds[0] * stds[1] * corr
    covs = [[stds[0]**2, covariance],
            [covariance, stds[1]**2]]
    df = pd.DataFrame(np.random.multivariate_normal(means, covs, n),
                      columns=['OCAS', 'OES'])
    # Marks are whole numbers; astype(int) truncates towards zero.
    df['OCAS'] = df['OCAS'].astype(int)
    df['OES'] = df['OES'].astype(int)
    df['Rank'] = (df['OCAS'] + df['OES']) / 2
    return df
Preview some of the data that is returned:
# Simulate a sample of marks with a correlation of 0.9 between OCAS and OES
df = simpleCorrelation(0.9)
# Display the first few rows
df.head()
OCAS | OES | Rank | |
---|---|---|---|
0 | 70 | 66 | 68.0 |
1 | 68 | 55 | 61.5 |
2 | 85 | 71 | 78.0 |
3 | 57 | 44 | 50.5 |
4 | 47 | 32 | 39.5 |
We can plot the data, with distributions, easily enough:
# Joint regression plot of OES against OCAS with marginal distributions.
# x and y are passed by keyword: recent seaborn releases accept only
# `data` positionally.
g = sns.jointplot(x="OCAS", y="OES", data=df,
                  kind="reg", truncate=False,
                  xlim=(0, 100), ylim=(0, 100),
                  color="blue", height=7)
Define a helper that generates the boundary line for a result class:
def get_spirit_of_rank(classification, ocas, oes, rank):
    """Build the vertices of the boundary line for one result class.

    A result class is defined by three minimum scores:
    - the minimum final OCAS for that status;
    - the minimum final OES for that status;
    - the minimum Rank score for that status.
    Rank is defined for each student as the weighted average of their
    final OCAS and final OES, and a student must meet all three boundary
    scores to achieve the status. (Author's note: Rank > OCAS, Rank > OES.)

    The returned polyline runs along y = oes from x = 100, cuts the corner
    along the line x + y = 2 * rank, then runs up x = ocas to y = 100.

    Args:
        classification: Label stored in the 'Class' column of every row.
        ocas: Minimum OCAS score (plotted on the x axis).
        oes: Minimum OES score (plotted on the y axis).
        rank: Minimum Rank score.

    Returns:
        A four-row DataFrame with the columns 'Class', 'x' and 'y'.
    """
    double_rank = 2 * rank
    # Vertices in drawing order; the middle two lie on x + y = 2 * rank.
    vertices = [
        (100, oes),
        (double_rank - oes, oes),
        (ocas, double_rank - ocas),
        (ocas, 100),
    ]
    xs, ys = zip(*vertices)
    return pd.DataFrame({'Class': classification,
                         'x': list(xs),
                         'y': list(ys)})
# Worked example: boundary vertices for minimum OCAS 85, OES 79 and Rank 87
sor = get_spirit_of_rank('class1', 85, 79, 87)
# Display the resulting vertex table
sor
Class | x | y | |
---|---|---|---|
0 | class1 | 100 | 79 |
1 | class1 | 95 | 79 |
2 | class1 | 85 | 89 |
3 | class1 | 85 | 100 |
These can be overplotted on the distribution:
# Joint regression plot of the simulated marks.
# x and y are passed by keyword: recent seaborn releases accept only
# `data` positionally.
jointplot = sns.jointplot(x="OCAS", y="OES", data=df,
                          kind="reg", truncate=False,
                          xlim=(0, 100), ylim=(0, 100),
                          color="blue", height=7)
# Overlay a single class boundary (minimum OCAS 85, OES 80, Rank 87) in red
sor1 = get_spirit_of_rank('class1', 85, 80, 87)
plt.plot(sor1['x'], sor1['y'], color='red', linewidth=2);
Start to put together a thing to let us set grade boundaries (perhaps more convenient to make this interactive):
# The numbers are set as they are to help debugging...
# One row per result class, in the order class 1 to class 4, giving the
# oes, ocas and rank threshold for that class.
boundaries = pd.DataFrame({
    'class': [1, 2, 3, 4],
    'oes': [80, 65, 50, 35],
    'ocas': [85, 70, 55, 40],
    'rank': [90, 75, 60, 45],
})
boundaries
class | oes | ocas | rank | |
---|---|---|---|---|
0 | 1 | 80 | 85 | 90 |
1 | 2 | 65 | 70 | 75 |
2 | 3 | 50 | 55 | 60 |
3 | 4 | 35 | 40 | 45 |
We can then overplot the grade boundaries onto the distribution.
If we also add drop lines, these make it easier to see "false grade" areas, eg where incorrect grades are assigned if we just look at OCAS scores.
# Joint regression plot onto which every class boundary is drawn.
# x and y are passed by keyword: recent seaborn releases accept only
# `data` positionally.
jointplot = sns.jointplot(x="OCAS", y="OES", data=df,
                          kind="reg", truncate=False,
                          xlim=(0, 100), ylim=(0, 100),
                          color="blue", height=7)
def overplot(row):
    """Draw one class boundary and its drop line with pyplot.

    Args:
        row: Record with 'class', 'ocas', 'oes' and 'rank' entries, e.g.
            one row of the ``boundaries`` table.
    """
    outline = get_spirit_of_rank(row['class'], row['ocas'], row['oes'], row['rank'])
    plt.plot(outline['x'], outline['y'], color='red', linewidth=2)
    # The drop line runs from the third boundary vertex straight down to
    # y = 0; it helps us identify misclassifications.
    drop_x = outline['x'].iloc[2]
    drop_top = outline['y'].iloc[2]
    plt.plot([drop_x, drop_x], [0, drop_top],
             color='lightgrey')
# Draw the boundary for every class: overplot is called once per row (axis=1)
boundaries.apply(overplot, axis=1);
We can start to make a more interactive explorer, adding in a component that allows us to start to play with the distributions:
@interact(corr=(0, 1, 0.05),
          lower_ocas=(0, 50, 1),
          upper_ocas=(50, 100, 1),
          lower_oes=(0, 50, 1),
          upper_oes=(50, 100, 1),
          )
def corrdata(corr=0.9,
             lower_ocas=30, upper_ocas=85,
             lower_oes=10, upper_oes=80):
    """Interactive correlation plot display.

    Each (min, max, step) tuple passed to ``interact`` becomes a slider.
    On every change a fresh sample is simulated and re-plotted with the
    class boundaries overlaid.

    Args:
        corr: Correlation between the simulated OCAS and OES marks.
        lower_ocas: Lower limit of the simulated OCAS distribution.
        upper_ocas: Upper limit of the simulated OCAS distribution.
        lower_oes: Lower limit of the simulated OES distribution.
        upper_oes: Upper limit of the simulated OES distribution.
    """
    df = simpleCorrelation(corr, lower_ocas, upper_ocas, lower_oes, upper_oes)
    # x and y are passed by keyword: recent seaborn releases accept only
    # `data` positionally.
    sns.jointplot(x="OCAS", y="OES", data=df,
                  kind="reg", truncate=False,
                  xlim=(0, 100), ylim=(0, 100),
                  color="blue", height=7)
    boundaries.apply(overplot, axis=1)
interactive(children=(FloatSlider(value=0.9, description='corr', max=1.0, step=0.05), IntSlider(value=30, desc…
We can annotate the original marks dataframe with awards based on grade boundaries or just OCAS marks:
First, let's get a simple OCAS-based grade:
# Bin edges for an OCAS-only grade: 0, the class thresholds in ascending
# order, then an open-ended top edge.
ocas_bins = [0] + boundaries['ocas'].values[::-1].tolist() + [np.inf]
# right=False makes each bin [lower, upper), so a mark exactly on a
# threshold earns that class -- matching the >= comparisons in grader --
# and a mark of 0 is still binned instead of becoming NaN.
df['ocas_grade'] = pd.cut(df['OCAS'], bins=ocas_bins, right=False,
                          labels=[5, 4, 3, 2, 1]).astype(int)
df
OCAS | OES | Rank | ocas_grade | |
---|---|---|---|---|
0 | 70 | 66 | 68.0 | 3 |
1 | 68 | 55 | 61.5 | 3 |
2 | 85 | 71 | 78.0 | 2 |
3 | 57 | 44 | 50.5 | 3 |
4 | 47 | 32 | 39.5 | 4 |
... | ... | ... | ... | ... |
995 | 72 | 59 | 65.5 | 2 |
996 | 68 | 55 | 61.5 | 3 |
997 | 80 | 67 | 73.5 | 2 |
998 | 53 | 39 | 46.0 | 4 |
999 | 67 | 57 | 62.0 | 3 |
1000 rows × 4 columns
The complete grade calculation is more complex (is there a more Pythonic way of doing this?):
def grader(row, enforce_rank=True):
    """Return the result class earned by one student's marks.

    The module-level ``boundaries`` table is scanned in row order and the
    ``class`` of the first row whose thresholds are met is returned, so
    the table is expected to list the best class first.

    A class is earned only when the OES mark and the OCAS mark both reach
    that class's thresholds and, when ``enforce_rank`` is true, the Rank
    score reaches the rank threshold as well. Missing either the OES or
    the OCAS threshold is never compensated for by the Rank score.

    Args:
        row: Record with 'OES' and 'OCAS' entries, plus a 'Rank' entry
            when ``enforce_rank`` is true.
        enforce_rank: Whether the rank threshold must also be met.

    Returns:
        The matching ``class`` value from ``boundaries``, or 5 when no
        class's thresholds are met.
    """
    for _, boundary in boundaries.iterrows():
        marks_ok = (row['OES'] >= boundary['oes']
                    and row['OCAS'] >= boundary['ocas'])
        # Short-circuit so 'Rank' is only read when it is being enforced.
        if marks_ok and (not enforce_rank or row['Rank'] >= boundary['rank']):
            return boundary['class']
    return 5
# Grade every student: grader is called once per row (axis=1), with
# enforce_rank left at its default of True
df['overall_grade'] = df.apply(grader, axis=1)
df
OCAS | OES | Rank | ocas_grade | overall_grade | |
---|---|---|---|---|---|
0 | 70 | 66 | 68.0 | 3 | 3 |
1 | 68 | 55 | 61.5 | 3 | 3 |
2 | 85 | 71 | 78.0 | 2 | 2 |
3 | 57 | 44 | 50.5 | 3 | 4 |
4 | 47 | 32 | 39.5 | 4 | 5 |
... | ... | ... | ... | ... | ... |
995 | 72 | 59 | 65.5 | 2 | 3 |
996 | 68 | 55 | 61.5 | 3 | 3 |
997 | 80 | 67 | 73.5 | 2 | 3 |
998 | 53 | 39 | 46.0 | 4 | 4 |
999 | 67 | 57 | 62.0 | 3 | 3 |
1000 rows × 5 columns
# Flag (as 1/0) the rows whose OCAS-only grade number is lower than the
# overall grade number
df['misgrade'] = (df['ocas_grade'] < df['overall_grade']).astype(int)
df
OCAS | OES | Rank | ocas_grade | overall_grade | misgrade | |
---|---|---|---|---|---|---|
0 | 70 | 66 | 68.0 | 3 | 3 | 0 |
1 | 68 | 55 | 61.5 | 3 | 3 | 0 |
2 | 85 | 71 | 78.0 | 2 | 2 | 0 |
3 | 57 | 44 | 50.5 | 3 | 4 | 1 |
4 | 47 | 32 | 39.5 | 4 | 5 | 1 |
... | ... | ... | ... | ... | ... | ... |
995 | 72 | 59 | 65.5 | 2 | 3 | 1 |
996 | 68 | 55 | 61.5 | 3 | 3 | 0 |
997 | 80 | 67 | 73.5 | 2 | 3 | 1 |
998 | 53 | 39 | 46.0 | 4 | 4 | 0 |
999 | 67 | 57 | 62.0 | 3 | 3 | 0 |
1000 rows × 6 columns
Update the original chart with coloured misgrades:
# Cubehelix palette returned as a matplotlib colormap (as_cmap=True).
# NOTE(review): cmap is not referenced by the plotting code visible in this
# notebook -- presumably it was meant for the misgrade scatter; confirm.
cmap = sns.cubehelix_palette(5, light=1, hue=1, as_cmap=True)
@interact(corr=(0, 1, 0.05),
          lower_ocas=(0, 50, 1),
          upper_ocas=(50, 100, 1),
          lower_oes=(0, 50, 1),
          upper_oes=(50, 100, 1),
          enforce_rank=True
          )
def corrdata(corr=0.9,
             lower_ocas=30, upper_ocas=85,
             lower_oes=10, upper_oes=80, enforce_rank=True):
    """Interactive correlation plot display with misgrades highlighted.

    On every widget change a fresh sample is simulated, graded both from
    OCAS alone and with the full rule, and plotted with points coloured
    by the misgrade flag and the class boundaries overlaid.

    Args:
        corr: Correlation between the simulated OCAS and OES marks.
        lower_ocas: Lower limit of the simulated OCAS distribution.
        upper_ocas: Upper limit of the simulated OCAS distribution.
        lower_oes: Lower limit of the simulated OES distribution.
        upper_oes: Upper limit of the simulated OES distribution.
        enforce_rank: Passed through to ``grader``; whether the rank
            threshold must be met as well.
    """
    df = simpleCorrelation(corr, lower_ocas, upper_ocas, lower_oes, upper_oes)
    # Left-closed bins with an open-ended top edge, so a mark exactly on a
    # threshold earns that class (matching the >= comparisons in grader)
    # and marks of 0 or 100 are still binned. The fail label is 5, the
    # same value grader returns.
    df['ocas_grade'] = pd.cut(df['OCAS'], bins=ocas_bins[:-1] + [np.inf],
                              right=False,
                              labels=[5, 4, 3, 2, 1]).astype(int)
    df['overall_grade'] = df.apply(grader, enforce_rank=enforce_rank, axis=1)
    df['misgrade'] = (df['ocas_grade'] < df['overall_grade']).astype(int)
    # x and y are passed by keyword: recent seaborn releases accept only
    # `data` positionally. scatter=False leaves the points to the
    # colour-coded scatter drawn below.
    g = sns.jointplot(x="OCAS", y="OES", data=df,
                      kind="reg", truncate=False,
                      xlim=(0, 100), ylim=(0, 100),
                      color="blue",
                      height=7, scatter=False)
    g.ax_joint.scatter("OCAS", "OES", c="misgrade", marker="o", data=df,
                       s=10)
    boundaries.apply(overplot, axis=1)
interactive(children=(FloatSlider(value=0.9, description='corr', max=1.0, step=0.05), IntSlider(value=30, desc…
We can also do a plot to show areas where grades are misapplied and by how many grades.
First, create a grid of all grades, calculate the "correct" grade and the grade derived purely from OCAS scores, and compare them:
# Generate every OCAS/OES combination on the 0-100 mark scale
# Do we want to enforce the rank condition?
enforce_rank = True
all_scores = [(x, y) for x in range(0, 101) for y in range(0, 101)]
df_all = pd.DataFrame(all_scores, columns=['OCAS', 'OES'])
df_all['Rank'] = (df_all['OCAS'] + df_all["OES"]) / 2
# Left-closed bins with an open-ended top edge, so a mark exactly on a
# threshold earns that class (matching the >= comparisons in grader) and
# the marks 0 and 100 are still binned instead of becoming NaN.
df_all['ocas_grade'] = pd.cut(df_all['OCAS'], bins=ocas_bins[:-1] + [np.inf],
                              right=False,
                              labels=[5, 4, 3, 2, 1]).astype(int)
df_all['overall_grade'] = df_all.apply(grader, enforce_rank=enforce_rank, axis=1)
# Positive when the overall grade number is higher than the OCAS-only one
df_all['misgrade_amount'] = df_all['overall_grade'] - df_all['ocas_grade']
df_all['misgrade'] = (df_all['misgrade_amount'] != 0)
Chart the result:
# Set the default figure size to 15 x 15 inches
plt.rcParams["figure.figsize"] = (15, 15)
# One marker per OCAS/OES pair: colour encodes misgrade_amount and marker
# style encodes the misgrade flag
ax = sns.scatterplot(x="OCAS", y="OES", style='misgrade',
hue="misgrade_amount", data=df_all)
# Anchor the legend's centre-left point at the right-hand edge of the axes
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
# Overlay every class boundary and its drop line
boundaries.apply(overplot, axis=1);
Note that if we want interactive charts, they're easy enough to create, eg using the plotly
package:
# plotly.express is imported here, in the only cell that uses it
import plotly.express as px
# Scatter of OES against OCAS for the simulated marks, coloured by the
# misgrade flag
fig = px.scatter(x=df['OCAS'], y=df["OES"], color=df['misgrade'])
fig.show()